# Notebook bootstrap: report the interpreter, normalize the working directory,
# and enable IPython autoreload so edits to the local package are picked up.
import sys
print(f"Interpreter dir: {sys.executable}")
import os
# When launched from notebooks/, move up to the repo root so relative paths
# like 'data/raw' and 'reports/' resolve correctly.
if os.path.basename(os.getcwd()) == "notebooks":
os.chdir("../")
print(f"Working dir: {os.getcwd()}")
%load_ext autoreload
%autoreload 2
# General
from pathlib import Path
# Data & modeling
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 500)
import lightgbm as lgb
from scipy.stats import randint, uniform
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, f1_score
from sklearn.neural_network import MLPRegressor
# Plotting
import matplotlib.pyplot as plt
import plotly
# plotly.offline.init_notebook_mode(connected=True)
# Custom package: local storpdm helpers for data download, feature building,
# visualisation, and model selection.
from storpdm import DATA_PATH
from storpdm.make_dataset import download_dataset, import_dataset
from storpdm.visualize import (
visualise_sensor_correlation_all_engine,
visualise_sensor_data_distribution,
plot_time_history_all_engines,
)
from storpdm.build_features import (
find_correlated_data,
list_correlated_data,
find_time_independent_columns,
add_calculated_rul,
prepare_training_data,
)
from storpdm.train_model import EstimatorSelectionHelper
The data set consists of multiple multivariate time series. Each data set is further divided into training and test subsets. Each time series is from a different engine, i.e., the data can be considered to be from a fleet of engines of the same type. Each engine starts with a different degree of initial wear and manufacturing variation which is unknown to the user. This wear and variation is considered normal, i.e., it is not considered a fault condition. There are three operational settings that have a substantial effect on engine performance. These settings are also included in the data. The data is contaminated with sensor noise.
The engine is operating normally at the start of each time series, and develops a fault at some point during the series: the fault grows in magnitude until system failure.
The aim is to model and predict the Remaining Useful Life (RUL) as accurately as possible. In a practical setup, such a model can be used for:
The general modeling outline is described in the diagram below and is as follows:
In this notebook we model the failures from two different points of view
# Render the modeling-pipeline diagram (referenced above) inline.
from IPython.display import SVG
SVG(filename='reports/data flow.svg')
# Download the raw data only once: four train_*.txt files signal a
# previously completed download.
raw_train_files = list(Path('data/raw').glob('train_*.txt'))
if len(raw_train_files) != 4:
    download_dataset()
else:
    print('Raw data has been downloaded')

# Load the FD001 subset: RUL targets plus train/test time series.
df_rul, df_train, df_test = import_dataset(filename="FD001")
display(df_train)

# Heatmap of pairwise sensor correlations across all engines.
plt.figure(figsize=(15, 15))
visualise_sensor_correlation_all_engine(df_train)
# Drop one sensor of every pair whose correlation exceeds the threshold;
# near-duplicate channels add no information to the model.
correlation_threshold = 0.95
correlated_data = find_correlated_data(df_train, correlation_threshold)
columns_to_be_removed = list_correlated_data(correlated_data)
print(f"Removing {columns_to_be_removed} from columns")
df_train_proc = df_train.drop(columns=columns_to_be_removed)

# Re-check the correlation heatmap: extreme correlations should be gone now.
plt.figure(figsize=(15, 15))
visualise_sensor_correlation_all_engine(df_train_proc)

# Distribution of each remaining sensor.
plt.figure(figsize=(15, 15))
fig = visualise_sensor_data_distribution(df_train_proc)
# Constant (time-independent) sensors carry no degradation signal — drop them.
time_independent_columns = find_time_independent_columns(df_train_proc)
print(f"Removing constant columns: {time_independent_columns}")
df_train_proc2 = df_train_proc.drop(columns=time_independent_columns)
display(df_train_proc2)

# Attach the Remaining Useful Life (RUL) target column.
df_train_proc2 = add_calculated_rul(df_train_proc2)

# Sensor time histories against RUL for every engine.
plt.figure(figsize=(35, 17))
fig = plot_time_history_all_engines(df_train_proc2)
# Interactive per-engine RUL series, saved as a standalone HTML report.
from storpdm.visualize import interactive_rul_series
fig = interactive_rul_series(df_train_proc2, filename='reports/series.html')
We perform a comparison of 3 different models with several combinations of hyperparameters and automatically select the best one
# Display processed dataset (last expression in the cell renders it)
df_train_proc2
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler

# Candidate regressors for the model-selection grid search.
# FIX: Ridge(normalize=True) was deprecated in scikit-learn 1.0 and removed
# in 1.2 — standardise the features in a Pipeline instead, as recommended
# by the deprecation notice.
models = {
    # "rf": RandomForestRegressor(),  # ~3 minutes
    "lgb": lgb.LGBMRegressor(),
    "linear": Pipeline([("std", StandardScaler()),
                        ("m1", Ridge())]),
    "mlp": Pipeline([("std", MinMaxScaler()),
                     ("m1", MLPRegressor(early_stopping=True))]),
}

# Hyperparameter search space per model; an empty dict means defaults only.
params = {
    # "rf": {"n_estimators": [100, 500]},
    "lgb": {
        "n_estimators": [100, 500],
        "learning_rate": [0.005, 0.01],
        "max_depth": [5, 10, 15, 20],
    },
    "mlp": {
        # FIX: (128) is just the int 128; (128,) makes the single-hidden-layer
        # intent explicit (sklearn treats a scalar the same way).
        "m1__hidden_layer_sizes": [(128,), (64, 32)],
        "m1__learning_rate_init": [0.001, 0.005, 0.01],
    },
    "linear": {},
}

# Split features/target; the engine id is an identifier, not a feature.
X_train, X_test, y_train, y_test = prepare_training_data(
    df=df_train_proc2, target_col="RUL", discard_cols="id"
)
%%time
estim_grid = EstimatorSelectionHelper(models, params)
estim_grid.fit(X_train, y_train, scoring="neg_root_mean_squared_error", n_jobs=2, cv=None)
estim_grid.score_summary()
model = estim_grid.best_model
model
y_test_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_test_pred)
rmse = mean_squared_error(y_test, y_test_pred, squared=False)
print(f"Random Forest Regression test MAE: {mae:.3f}")
print(f"Random Forest Regression test RMSE: {rmse:.3f}")
# Predictions on the held-out split, ordered by actual RUL.
import numpy as np
import plotly.graph_objects as go

df_plot = pd.DataFrame(
    {"y_test": y_test, "y_test_pred": y_test_pred, 'cycle': X_test.cycle}
).sort_values('y_test')

# Scatter of predicted vs actual RUL, coloured by absolute error.
abs_error = np.abs(df_plot["y_test"] - df_plot["y_test_pred"])
fig = go.Figure()
fig.add_trace(
    go.Scattergl(
        x=df_plot["y_test"],
        y=df_plot["y_test_pred"],
        mode="markers",
        marker=dict(
            color=abs_error,
            colorscale="Viridis",
            line_width=0,
            opacity=0.7,
            size=6,
        ),
        name="lines",
    )
)
# Dotted identity line: a perfect model would sit on y = x.
fig.add_shape(type="line", x0=0, y0=0, x1=350, y1=350, line=dict(width=4, dash="dot", color='red'))
fig.update_layout(
    title="Actual vs predicted RUL",
    xaxis_title="Actual RUL",
    yaxis_title="Predicted",
)
fig.show()
Display the predicted vs actual RUL for a couple of engines
# Per-engine comparison of actual vs predicted RUL on the training data.
y_train_pred = model.predict(df_train_proc2.drop(['RUL', 'id'], axis=1))
df_plot = pd.DataFrame({"y_train": df_train_proc2.RUL, "y_train_pred": y_train_pred, 'id': df_train_proc2.id})

fig = go.Figure()
engine_list = [96, 39, 80, 71]
for engine in engine_list:
    df_plot_filter = df_plot[df_plot.id == engine]
    fig.add_trace(
        go.Scattergl(
            x=df_plot_filter["y_train"],
            y=df_plot_filter["y_train_pred"],
            # mode="markers",
            marker=dict(
                color=np.abs(df_plot_filter["y_train"] - df_plot_filter["y_train_pred"]),
                colorscale="Viridis",
                line_width=0,
                opacity=0.7,
                size=6,
            ),
            name=f"Engine {engine}",  # FIX: legend typo, was "Enigne"
        )
    )
# Dotted identity line (perfect prediction).
fig.add_shape(type="line", x0=0, y0=0, x1=350, y1=350, line=dict(width=4, dash="dot", color='red'))
fig.update_layout(
    title="Actual vs predicted RUL",
    xaxis_title="Actual RUL",
    yaxis_title="Predicted",
)
fig.show()
# TODO: Shap feature importance ---> FIXME
import shap
# Summarise the training data to 50 k-means centroids so KernelExplainer
# stays tractable (it evaluates the model once per background sample).
data_shap = shap.kmeans(X_train, 50)
explainer = shap.KernelExplainer(model.predict, data=data_shap)
# Explain a 100-row sample rather than the full training set, for speed.
X_sample = X_train.sample(100)
shap_values = explainer.shap_values(X_sample, l1_reg = 'num_features(100)')
# summarize the effects of all the features
shap.summary_plot(shap_values, X_sample)
from pdpbox import pdp, get_dataset, info_plots
# Partial-dependence plots: marginal effect of each selected feature on the
# model's RUL prediction, computed on the 100-row sample.
features = ['cycle','StaticHPCOutletPres','PhysCoreSpeed','LPCOutletTemp','FuelFlowRatio']
for f in features:
pdp_goals = pdp.pdp_isolate(model = model,
dataset = X_sample,
model_features = X_train.columns,
feature = f)
pdp.pdp_plot(pdp_goals, f, x_quantile=False, plot_pts_dist=True, plot_lines=True,center=True)
# Binary classification target: 1 when the engine fails within
# `cycle_threshold` cycles (RUL <= threshold), else 0.
cycle_threshold = 10
df_train_proc2['RUL_thres'] = (df_train_proc2['RUL'] <= cycle_threshold).astype(int)
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression

# Candidate classifiers for "failure within threshold" prediction.
models = {
    # "rf": RandomForestRegressor(),  # ~3 minutes
    "lgb": lgb.LGBMClassifier(),
    "logistic": LogisticRegression(max_iter=300),
    "mlp": Pipeline([("std", MinMaxScaler()),
                     ("m1", MLPClassifier(early_stopping=True))]),
}

# Hyperparameter grids; an empty dict means defaults only.
params = {
    # "rf": {"n_estimators": [100, 500]},
    "lgb": {
        "n_estimators": [100, 500],
        "learning_rate": [0.005, 0.01],
        "max_depth": [5, 10, 15, 20],
    },
    "mlp": {
        "m1__hidden_layer_sizes": [(128), (64, 32)],
        "m1__learning_rate_init": [0.001, 0.005, 0.01],
    },
    "logistic": {},
}

# The binary label replaces RUL as the target; drop both the engine id and
# the raw RUL column from the features.
X_train, X_test, y_train, y_test = prepare_training_data(
    df=df_train_proc2, target_col="RUL_thres", discard_cols=["id", "RUL"]
)
%%time
# Grid-search the classifiers, selecting by F1 score; then report F1 on the
# held-out split.
estim_grid_clf = EstimatorSelectionHelper(models, params)
estim_grid_clf.fit(X_train, y_train, scoring="f1", n_jobs=2, cv=None)
estim_grid_clf.score_summary().head()
model = estim_grid_clf.best_model
y_test_pred = model.predict(X_test)
f1 = f1_score(y_test, y_test_pred)
print(f"F1 score test: {f1:.3f}")
from storpdm.utils import metrics_threshold
from storpdm.build_features import prepare_training_data
from storpdm.visualize import display_roc_pr
# Threshold sweep: precision/recall/fpr and confusion counts at each
# probability cut-off, computed from the positive-class scores.
y_score = model.predict_proba(X_test)
metrics = metrics_threshold(y_score[:,1], y_test)
display_roc_pr(precision = metrics['prec'],
recall = metrics['rec'],
fpr = metrics['fpr'],
thres = metrics['thres'])
# TODO: display table too
# Tabulate every 10th threshold, highest threshold first.
display(
pd.DataFrame(
{
"thresholds": metrics["thres"],
"precision": metrics["prec"],
"recall": metrics["rec"],
"fpr": np.round(metrics["fpr"],3),
"TP": metrics["tp"],
"TN": metrics["tn"],
"FP": metrics["fp"],
"FN": metrics["fn"],
}
)
.loc[::10]
.loc[::-1]
)
First we define the following metrics:
Looking at the ROC-Precision graph above, we can choose an optimal threshold based on the business needs.
A higher threshold (for example, 0.8) means the fpr is 0.003, precision is 0.928 and recall is 0.637. This means we are very confident of our predictions, but we miss 36.3% of the failures. This threshold is optimal if the cost of a false alarm (false positive) is high relative to the cost of a missed failure.
Let's look at the confusion matrix with a threshold of 0.5:
from storpdm.visualize import plot_confusion_matrix
from sklearn.metrics import confusion_matrix

# Normalised confusion matrix at the default 0.5 probability threshold.
# FIX: the original called plot_confusion_matrix twice with identical
# arguments; the accidental duplicate call has been removed.
# NOTE(review): cycle_threshold is 10 *cycles* but the label says "days" —
# confirm the unit with the data owner.
confusionMatrix = confusion_matrix(y_test, y_test_pred)
plot_confusion_matrix(confusionMatrix, normalize = True, classes=["No failure (N)","Failure within 10 days (P)"])
from plotly.subplots import make_subplots
# Plot true positives and false positives against the decision threshold,
# as overlaid bar (every 10th point) + line traces on a single figure.
# Params: thres/tp/fp are aligned sequences of thresholds, TP counts and FP
# counts; name1/name2 label the TP and FP traces respectively.
# NOTE(review): `title`, `fig`, and `i` are accepted but never used — `fig`
# is unconditionally replaced by make_subplots below.
# Returns the assembled plotly Figure.
def display_tp(thres,tp, fp, title="", fig=None, i=0, name1 = "", name2 = ""):
# Plotly's default qualitative palette, hard-coded.
colors = [
"#1f77b4", # muted blue
"#ff7f0e", # safety orange
"#2ca02c", # cooked asparagus green
"#d62728", # brick red
"#9467bd", # muted purple
"#8c564b", # chestnut brown
"#e377c2", # raspberry yogurt pink
"#7f7f7f", # middle gray
"#bcbd22", # curry yellow-green
"#17becf", # blue-teal
]
# One hover label per point showing TP, FP, and the threshold.
hovertext = [
f"TP: {r}<br>FP: {p}<br>Thres: {t:.2f}"
for r, p, t in zip(tp,fp, thres)
]
# lw is unused below.
lw = 2
# A secondary-y subplot is created, though all traces use the primary axis.
fig = make_subplots(specs=[[{"secondary_y": True}]])
opacity=0.8
## TP trace: bars at every 10th threshold plus a continuous line.
fig.add_trace(
go.Bar(
x=thres[::10],
y=tp[::10],
name=name1,
hovertext=hovertext[::10],
hoverinfo="text",
marker_color =colors[9],
opacity=opacity,
xaxis='x1',
yaxis='y1'
),
secondary_y=False,
)
fig.add_trace(
go.Scatter(
x=thres,
y=tp,
mode="lines",
name=name1,
hovertext=hovertext,
hoverinfo="text",
line=dict(color=colors[9]),
showlegend=False,
xaxis='x1',
yaxis='y1'
),
secondary_y=False
)
## FP trace: same bar + line pairing in a contrasting colour.
fig.add_trace(
go.Bar(
x=thres[::10],
y=fp[::10],
# mode="lines",
name=name2,
hovertext=hovertext[::10],
hoverinfo="text",
marker_color =colors[4],
opacity=opacity,
xaxis='x1',
yaxis='y1'
),
secondary_y=False
)
fig.add_trace(
go.Scatter(
x=thres,
y=fp,
mode="lines",
name=name2,
hovertext=hovertext,
hoverinfo="text",
line=dict(color=colors[4]),
showlegend=False,
xaxis='x1',
yaxis='y1'
),
secondary_y=False
)
fig.update_yaxes(title_text="")
# x-axis label is Spanish for "model probability threshold".
fig.update_xaxes(title_text="Limite probabilidad modelo")
fig.update_yaxes(range=[0, 210])
fig.update_xaxes(range=[.25, 0.95])
fig.update_layout(height=500)
return fig
# Detected failures (TP) and false positives (FP) vs decision threshold.
fig = display_tp(
    thres=metrics["thres"],
    tp=metrics["tp"],
    fp=metrics["fp"],
    title="",
    fig=None,
    i=0,
    name1="Averías detectadas",  # FIX: repaired mojibake ("AverÃ­as")
    name2='Falsos positivos',
)
# plotly.offline.plot(fig, filename='reports/comparativa_metricas.html')
fig
# Predicted failure probability vs RUL for one engine: the probability should
# rise as RUL approaches the labelling threshold (red line at RUL = 10).
y_train_pred = estim_grid_clf.best_model.predict_proba(
    df_train_proc2.drop(['RUL', 'id', 'RUL_thres'], axis=1)
)[:, 1]
df_plot = pd.DataFrame({"y_train": df_train_proc2.RUL, "y_train_pred": y_train_pred, 'id': df_train_proc2.id})

fig = go.Figure()
engine = 96
df_plot_filter = df_plot[df_plot.id == engine]
fig.add_trace(
    go.Bar(
        x=df_plot_filter["y_train"],
        y=df_plot_filter["y_train_pred"],
        name=f"Engine {engine}",  # FIX: legend typo, was "Enigne"
    )
)
# Vertical dotted line at the classification threshold.
fig.add_shape(type="line", x0=10, y0=0, x1=10, y1=1, line=dict(width=4, dash="dot", color='red'))
fig.update_layout(
    # FIX: title typo, was "probabilty"
    title=f"Predicted probability of failure vs RUL engine id = {engine}",
    xaxis_title="RUL",
    yaxis_title="Predicted probability",
)
fig.show()
fig.show(renderer="notebook")